In [1]:
import gc
import pandas as pd
import polars as pl
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

import warnings
warnings.filterwarnings('ignore')
In [2]:
# Eagerly read the train and test parquet files into polars DataFrames.
df, test = (
    pl.read_parquet('../data/train.parquet'),
    pl.read_parquet('../data/test.parquet'),
)
In [3]:
# Preview the first five rows of the training frame.
df.head(n=5)
Out[3]:
shape: (5, 25)
DurationDistancePLongPLatdDLongDLatdHaversinePmonthPdayPhourPminPDweekDmonthDdayDhourDminDDweekTempPrecipWindHumidSolarSnowGroundTempDust
i64i64f64f64f64f64f64i64i64i64i64i64i64i64i64i64i64f64f64f64f64f64f64f64f64
28848037.530167127.00743937.535221127.0683985.40456141419485414201659.50.02.376.00.020.09.913.0
28461037.512104127.1077837.531013127.1423653.704593910202209102052023.40.01.848.00.00.022.517.0
43747037.557968126.83828737.557461126.8614582.04327333018154330190417.40.03.157.00.520.017.252.0
789037.610523127.05979937.615299127.0644680.671691924202909242037017.60.01.457.00.00.016.210.0
45518037.653015127.04699737.653015127.0469970.081520162815214228.40.01.569.00.010.026.90.0
In [4]:
# Per-column summary statistics of the training frame (count, null count,
# mean, std, min, quartiles, max).
df.describe()
Out[4]:
shape: (9, 26)
describeDurationDistancePLongPLatdDLongDLatdHaversinePmonthPdayPhourPminPDweekDmonthDdayDhourDminDDweekTempPrecipWindHumidSolarSnowGroundTempDust
strf64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64
"count"7.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e67.680911e6
"null_count"0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
"mean"25.8015423713.36358737.547609126.99082537.547692126.9907641.8698777.57923615.76576814.38011429.1500362.943937.57968415.76536514.42833729.9948242.94586318.561780.0178221.81931553.9784260.7298260.00998620.72195832.451052
"std"25.0429273957.1373150.044410.0825720.0444480.0832351.9955232.6967678.748756.24818717.258691.9759442.6967528.7484966.48361817.308831.9775469.4985080.3722620.97100817.4225910.9172150.16308312.65072224.071132
"min"1.01.037.437271126.79859937.437271126.7985990.01.01.00.00.00.01.01.00.00.00.0-17.80.00.010.00.00.0-13.60.0
"25%"8.01230.037.51424126.92003637.51424126.9193950.6756026.08.010.014.01.06.08.010.015.01.012.20.01.141.00.00.011.616.0
"50%"16.02280.037.54707126.99426337.546547126.9946821.2572818.016.016.029.03.08.016.016.030.03.020.10.01.753.00.270.021.127.0
"75%"36.04620.037.573242127.06189737.573242127.0624242.36308110.023.019.044.05.010.023.020.045.05.025.50.02.467.01.260.028.542.0
"max"119.033290.037.68972127.18026737.68972127.18026728.6344812.031.023.059.06.012.031.023.059.06.039.435.07.498.03.528.862.2304.0
In [5]:
# Same per-column summary for the test frame, to compare against the
# training statistics.
test.describe()
Out[5]:
shape: (9, 26)
describeDurationDistancePLongPLatdDLongDLatdHaversinePmonthPdayPhourPminPDweekDmonthDdayDhourDminDDweekTempPrecipWindHumidSolarSnowGroundTempDust
strf64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64f64
"count"1.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e61.920228e6
"null_count"0.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
"mean"25.8085193714.13924237.547623126.99069637.54771126.9906261.8720647.57682615.77262114.36957729.1472492.943347.57725415.77182714.41764929.9845982.94540118.5629230.0179541.81910953.9645480.7310350.00980220.73098132.485708
"std"25.0449323954.5073780.0444290.0825640.0444730.0832471.9947112.6969178.745066.25346617.2570831.9765372.6968948.7449566.4879917.3126821.9779359.4963440.3790420.97109517.4175720.9186410.16170912.65816524.078638
"min"1.01.037.437271126.79859937.437271126.7985990.01.01.00.00.00.01.01.00.00.00.0-17.80.00.010.00.00.0-13.60.0
"25%"8.01230.037.51424126.91999137.51424126.9193950.6763786.08.010.014.01.06.08.010.015.01.012.20.01.141.00.00.011.616.0
"50%"16.02280.037.546848126.99426337.546547126.9942631.2588238.016.016.029.03.08.016.016.030.03.020.10.01.753.00.270.021.127.0
"75%"36.04630.037.573242127.06176837.573242127.0624242.36637410.023.019.044.05.010.023.019.045.05.025.50.02.467.01.260.028.642.0
"max"119.033290.037.68972127.18026737.68972127.18026724.97833412.031.023.059.06.012.031.023.059.06.039.435.07.498.03.528.862.2304.0
In [6]:
# One Duration histogram trace per split, overlaid on a single set of axes so
# the train and test target distributions can be compared by eye.
hist_train, hist_test = (
    go.Histogram(x=frame['Duration'], name=label)
    for label, frame in (('Train', df), ('Test', test))
)

fig = make_subplots()
fig.add_traces([hist_train, hist_test])

fig.update_layout(
    barmode='overlay',
    title='Distribution of Duration',
    xaxis_title_text='Duration',
    yaxis_title_text='Count',
)

fig.show()
  • トレーニングデータとテストデータのターゲット分布は概ね一致
  • Durationが60分前後のところで件数が落ち込んでいる
In [7]:
# Drop the reference to the test frame and run a garbage-collection pass so
# its memory can be reclaimed; gc.collect() is the cell's displayed value.
del test
gc.collect()
Out[7]:
35
In [8]:
def reduce_mem_usage(df: pl.LazyFrame) -> pl.LazyFrame:
    """Cast every signed-integer column of a lazy frame to ``UInt16``.

    Only columns whose dtype is ``Int8``, ``Int16``, ``Int32`` or ``Int64`` are
    cast; floats, unsigned integers and all other dtypes are left untouched.
    The casts are appended to the lazy query, so the caller still has to
    ``.collect()`` the result.

    Parameters
    ----------
    df : pl.LazyFrame
        Lazy query whose signed-integer columns should be narrowed.

    Returns
    -------
    pl.LazyFrame
        ``df`` with one ``UInt16`` cast per signed-integer column, or ``df``
        itself when it has no such column.

    Notes
    -----
    ``UInt16`` can only represent 0..65535. NOTE(review): the cast is applied
    without inspecting the data; negative or larger values are expected to
    make polars' default (strict) cast fail at collect time -- confirm the
    value range of any new integer column before relying on this helper.
    """
    signed_int_dtypes = (pl.Int8, pl.Int16, pl.Int32, pl.Int64)

    # Compare real dtypes from the schema rather than slicing the textual
    # repr of a dtype list, which silently depends on polars' repr format.
    casts = [
        pl.col(name).cast(pl.UInt16)
        for name, dtype in df.schema.items()
        if dtype in signed_int_dtypes
    ]

    if not casts:
        return df

    # A single with_columns call applies all casts in one projection.
    return df.with_columns(casts)
In [9]:
# Rebuild the training frame from a lazy scan with its integer columns
# downcast. `df` is rebound to the lazy scan *before* collecting, which drops
# this name's reference to the eagerly loaded frame from the earlier cell
# instead of keeping both copies alive while the new one is materialized.
df = pl.scan_parquet('../data/train.parquet')
df = reduce_mem_usage(df).collect()
In [10]:
# Preview the first five rows again to check the new column dtypes.
df.head(n=5)
Out[10]:
shape: (5, 25)
DurationDistancePLongPLatdDLongDLatdHaversinePmonthPdayPhourPminPDweekDmonthDdayDhourDminDDweekTempPrecipWindHumidSolarSnowGroundTempDust
u16u16f64f64f64f64f64u16u16u16u16u16u16u16u16u16u16f64f64f64f64f64f64f64f64
28848037.530167127.00743937.535221127.0683985.40456141419485414201659.50.02.376.00.020.09.913.0
28461037.512104127.1077837.531013127.1423653.704593910202209102052023.40.01.848.00.00.022.517.0
43747037.557968126.83828737.557461126.8614582.04327333018154330190417.40.03.157.00.520.017.252.0
789037.610523127.05979937.615299127.0644680.671691924202909242037017.60.01.457.00.00.016.210.0
45518037.653015127.04699737.653015127.0469970.081520162815214228.40.01.569.00.010.026.90.0
In [11]:
# Estimated in-memory size of the frame in megabytes, rounded to one decimal.
size_mb = df.estimated_size('mb')
print(round(size_mb, 1))
937.6

Duration¶

  • Durationのmaxが119minであることから2hour以上使用できないと考えられる
  • 1hourごとに料金が加算される可能性がある
  • もう少しDurationの分布を細かくみる必要がある
In [14]:
# Zoom in on the tail of the target: histogram of rides whose Duration
# exceeds 50.
over50_duration = df.select('Duration').filter(pl.col('Duration') > 50)
hist = go.Histogram(name='over50_duration', x=over50_duration['Duration'])

fig = make_subplots()
fig.add_trace(hist)

fig.update_layout(
    barmode='overlay',
    title='Over 50 Duration',
    xaxis_title_text='Duration',
    yaxis_title_text='Count',
)

fig.show()
In [92]:
# Usage overview keyed by the Pmonth column: ride count, total Duration,
# mean Duration, and a box plot of Duration per month.
group_month = (
    df.select(["Duration", "Pmonth"])
    .group_by("Pmonth")
    .agg(
        # Explicit aliases replace the deprecated Expr.suffix() and produce
        # the same column names.
        pl.col("Duration").count().alias("Duration_count"),
        pl.col("Duration").sum().alias("Duration_sum"),
        pl.col("Duration").mean().alias("Duration_avg"),
    )
    # group_by does not guarantee row order; sort so the frame is stable.
    .sort("Pmonth")
)

bar_count = go.Bar(x=group_month["Pmonth"], y=group_month["Duration_count"], name="Count")
bar_sum = go.Bar(x=group_month["Pmonth"], y=group_month["Duration_sum"], name="Sum")
bar_avg = go.Bar(x=group_month["Pmonth"], y=group_month["Duration_avg"], name="Avg.")

# Take ONE 1% sample and read both columns from it. Calling df.sample() once
# for x and once for y draws two unrelated row sets, so each Duration would be
# paired with a random month and every box would show the same distribution.
# The seed keeps the figure reproducible across runs.
month_sample = df.select(["Pmonth", "Duration"]).sample(fraction=0.01, seed=42)
box_month = go.Box(x=month_sample["Pmonth"], y=month_sample["Duration"])

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        '<b>Bar of User Count</b>',
        '<b>Bar of Total Duration</b>',
        '<b>Bar of Duration Average</b>',
        '<b>Box of Duration</b>',  # closing </b> tag was missing
    ),
)

fig.add_trace(bar_count, row=1, col=1)
fig.add_trace(bar_sum, row=1, col=2)
fig.add_trace(bar_avg, row=2, col=1)
fig.add_trace(box_month, row=2, col=2)

fig.update_layout(
    width=1500,
    height=500,
    margin=dict(t=15, b=5, l=5, r=5),
    template="plotly_white",
    showlegend=False
)

# Axis titles: every panel shares the x title; the y title depends on the panel.
axis_font = dict(size=12, color='black')
y_titles = {
    (1, 1): 'Count',
    (1, 2): 'Minutes',
    (2, 1): 'Minutes / time',
    (2, 2): 'Minutes',
}
for (row, col), y_title in y_titles.items():
    fig.update_xaxes(title=dict(text='Month', font=axis_font), row=row, col=col)
    fig.update_yaxes(title=dict(text=y_title, font=axis_font), row=row, col=col)

fig.update_annotations(font_size=12)
fig.show()
In [91]:
# Usage overview keyed by the PDweek column: ride count, total Duration,
# mean Duration, and a box plot of Duration per day of week.
group_week = (
    df.select(["Duration", "PDweek"])
    .group_by("PDweek")
    .agg(
        # Explicit aliases replace the deprecated Expr.suffix() and produce
        # the same column names.
        pl.col("Duration").count().alias("Duration_count"),
        pl.col("Duration").sum().alias("Duration_sum"),
        pl.col("Duration").mean().alias("Duration_avg"),
    )
    # group_by does not guarantee row order; sort so the frame is stable.
    .sort("PDweek")
)

bar_count = go.Bar(x=group_week["PDweek"], y=group_week["Duration_count"], name="Count")
bar_sum = go.Bar(x=group_week["PDweek"], y=group_week["Duration_sum"], name="Sum")
bar_avg = go.Bar(x=group_week["PDweek"], y=group_week["Duration_avg"], name="Avg.")

# Take ONE 1% sample and read both columns from it. Calling df.sample() once
# for x and once for y draws two unrelated row sets, so each Duration would be
# paired with a random weekday and every box would show the same distribution.
# The seed keeps the figure reproducible across runs.
week_sample = df.select(["PDweek", "Duration"]).sample(fraction=0.01, seed=42)
box_week = go.Box(x=week_sample["PDweek"], y=week_sample["Duration"], boxmean=True)

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        '<b>Bar of User Count</b>',
        '<b>Bar of Total Duration</b>',
        '<b>Bar of Duration Average</b>',
        '<b>Box of Duration</b>',
    ),
)

fig.add_trace(bar_count, row=1, col=1)
fig.add_trace(bar_sum, row=1, col=2)
fig.add_trace(bar_avg, row=2, col=1)
fig.add_trace(box_week, row=2, col=2)

fig.update_layout(
    width=1500,
    height=700,
    margin=dict(t=15, b=5, l=5, r=5),
    template="plotly_white",
    showlegend=False
)

# Axis titles: every panel shares the x title; the y title depends on the panel.
axis_font = dict(size=12, color='black')
y_titles = {
    (1, 1): 'Count',
    (1, 2): 'Minutes',
    (2, 1): 'Minutes / time',
    (2, 2): 'Minutes / time',
}
for (row, col), y_title in y_titles.items():
    fig.update_xaxes(title=dict(text='Week', font=axis_font), row=row, col=col)
    fig.update_yaxes(title=dict(text=y_title, font=axis_font), row=row, col=col)

fig.update_annotations(font_size=12)
fig.show()
In [90]:
# Usage overview keyed by the Phour column: ride count, total Duration,
# mean Duration, and a box plot of Duration per hour.
group_hour = (
    df.select(["Duration", "Phour"])
    .group_by("Phour")
    .agg(
        # Explicit aliases replace the deprecated Expr.suffix() and produce
        # the same column names.
        pl.col("Duration").count().alias("Duration_count"),
        pl.col("Duration").sum().alias("Duration_sum"),
        pl.col("Duration").mean().alias("Duration_avg"),
    )
    # group_by does not guarantee row order; sort so the frame is stable.
    .sort("Phour")
)

bar_count = go.Bar(x=group_hour["Phour"], y=group_hour["Duration_count"], name="Count")
bar_sum = go.Bar(x=group_hour["Phour"], y=group_hour["Duration_sum"], name="Sum")
bar_avg = go.Bar(x=group_hour["Phour"], y=group_hour["Duration_avg"], name="Avg.")

# Take ONE 1% sample and read both columns from it. Calling df.sample() once
# for x and once for y draws two unrelated row sets, so each Duration would be
# paired with a random hour and every box would show the same distribution.
# The seed keeps the figure reproducible across runs.
hour_sample = df.select(["Phour", "Duration"]).sample(fraction=0.01, seed=42)
box_hour = go.Box(x=hour_sample["Phour"], y=hour_sample["Duration"], boxmean=True)

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        '<b>Bar of User Count</b>',
        '<b>Bar of Total Duration</b>',
        '<b>Bar of Duration Average</b>',
        '<b>Box of Duration</b>',
    ),
)

fig.add_trace(bar_count, row=1, col=1)
fig.add_trace(bar_sum, row=1, col=2)
fig.add_trace(bar_avg, row=2, col=1)
fig.add_trace(box_hour, row=2, col=2)

fig.update_layout(
    width=1500,
    height=700,
    margin=dict(t=15, b=5, l=5, r=5),
    template="plotly_white",
    showlegend=False
)

# Axis titles: every panel shares the x title; the y title depends on the panel.
axis_font = dict(size=12, color='black')
y_titles = {
    (1, 1): 'Count',
    (1, 2): 'Minutes',
    (2, 1): 'Minutes / time',
    (2, 2): 'Minutes / time',
}
for (row, col), y_title in y_titles.items():
    fig.update_xaxes(title=dict(text='Hour', font=axis_font), row=row, col=col)
    fig.update_yaxes(title=dict(text=y_title, font=axis_font), row=row, col=col)

fig.update_annotations(font_size=12)
fig.show()

Note¶

  • 月ごと -> 冬は利用者が少なそう
  • 曜日ごと -> 利用者はほぼ同じだが、土日のDurationが大きそう
  • 時間ごと -> 通勤時間(8時と18時)の利用者が多そう
In [93]:
# Release everything the month / week / hour overview cells created. The
# original list left out group_hour, box_month and box_week, so those frames
# and traces stayed alive after the "cleanup".
del fig, group_month, group_week, group_hour
del bar_count, bar_sum, bar_avg
del box_month, box_week, box_hour
gc.collect()
Out[93]:
18747
In [114]:
# Compare the Duration distribution of (a) PDweek values 5/6 against the
# remaining days and (b) rides starting at hour 8 against hour 18, each
# against the whole sample.
# NOTE(review): PDweek 5 and 6 are treated as weekend days -- confirm the
# encoding of PDweek against the data dictionary.
holidays = [5, 6]

# Fixed seed so the 10% sample, and every figure and note derived from it,
# is reproducible across kernel restarts.
df_sample = df.sample(fraction=0.1, seed=42)

is_holiday = pl.col("PDweek").is_in(holidays)
holidays_df = df_sample.filter(is_holiday)
weekdays_df = df_sample.filter(~is_holiday)

morning_df = df_sample.filter(pl.col("Phour") == 8)
evening_df = df_sample.filter(pl.col("Phour") == 18)


def _duration_hist(frame, name):
    """Return a probability-normalized Duration histogram trace for `frame`."""
    return go.Histogram(x=frame["Duration"], name=name, histnorm='probability')


hist_all = _duration_hist(df_sample, "Raw")
hist_holidays = _duration_hist(holidays_df, "Holidays")
hist_weekdays = _duration_hist(weekdays_df, "Weekdays")
hist_morning = _duration_hist(morning_df, "Morning")
hist_evening = _duration_hist(evening_df, "Evening")

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        '<b>Compare Weekdays and Holidays</b>',
        '<b>Compare morning and evening</b>',
    ),
)

# Left panel: whole sample vs. holidays vs. weekdays.
for trace in (hist_all, hist_holidays, hist_weekdays):
    fig.add_trace(trace, row=1, col=1)

# Right panel: whole sample vs. morning vs. evening.
for trace in (hist_all, hist_morning, hist_evening):
    fig.add_trace(trace, row=1, col=2)

fig.update_layout(
    margin=dict(t=15, b=5, l=5, r=5),
    template="plotly_white",
)

# Both panels share the same axis titles.
axis_font = dict(size=12, color='black')
for col in (1, 2):
    fig.update_xaxes(title=dict(text='Duration', font=axis_font), row=1, col=col)
    fig.update_yaxes(title=dict(text='Prob.', font=axis_font), row=1, col=col)

fig.update_annotations(font_size=12)
fig.update_traces(opacity=0.5)
fig.show()
  • 休日はDurationが大きい利用者が多い
  • 8時台は10分以内の利用者が多く、20分以上の利用者が少ない
  • 18時台は20分以上の利用者が多い傾向にある
In [117]:
# Joint 2-D histogram of Temp against Duration on the sampled frame.
sns.jointplot(
    x="Temp",
    y="Duration",
    kind="hist",
    data=df_sample,
)
Out[117]:
<seaborn.axisgrid.JointGrid at 0x7f1856dd91e0>
In [118]:
# Same joint histogram, restricted to sampled rides with Duration below 60.
sns.jointplot(
    x="Temp",
    y="Duration",
    kind="hist",
    data=df_sample.filter(pl.col("Duration") < 60),
)
Out[118]:
<seaborn.axisgrid.JointGrid at 0x7f19253cee90>
In [ ]: